0. Load Libraries

# Attach a package, installing it from CRAN first when it is not available.
#
# The package name is read from the call expression itself, so it can be
# given either bare (`packages(dplyr)`) or quoted (`packages("dplyr")`).
#
# @param x Package name, bare or quoted.
# @return Invisibly `TRUE` once the package is attached. An error is raised
#   if the package still cannot be loaded after the installation attempt.
packages <- function(x) {
  # Take the name from the unevaluated call so a bare symbol is never evaluated.
  x <- as.character(match.call()[[2]])
  if (!require(x, character.only = TRUE)) {
    install.packages(pkgs = x, repos = "https://cran.r-project.org")
    # library() stops on failure; require() would merely return FALSE and let
    # the rest of the script run without the package.
    library(x, character.only = TRUE)
  }
  invisible(TRUE)
}

# Attach every package used by the analysis; packages() installs any that
# are missing before loading them.
packages(ggplot2)
packages(dplyr)
packages(googleVis)
packages(reshape)
packages(plotly)
packages(tm)
packages(RColorBrewer)
packages(wordcloud)
packages(RCurl)
# Set the googleVis plot tag option to 'chart'. options() returns the previous
# values, which are kept in `op` so they can be restored with options(op).
op <- options(gvis.plot.tag='chart')

1. Read in Data

# Load the loan data set from disk.
loan <- read.csv(
  file = "/Users/catherinecao/Documents/lending_club_project/loan.csv"
)

# Prefix a day-of-month ("01-") so the month-year text parses as a Date.
loan$issue_d <- as.Date(paste0("01-", loan$issue_d), format = "%d-%b-%Y")

# Four-digit issue year, stored as a character string.
loan$Year <- format(loan$issue_d, format = "%Y")

2. EDA

Growth of Lending Club

Total Loan Amount by Month

# Per issue date: total dollar amount issued and number of loans.
amnt_df <- loan %>%
  select(issue_d, loan_amnt) %>%
  group_by(issue_d) %>%
  summarise(Amount = sum(loan_amnt), Volume = n())

# Line chart of the amount issued on each issue date.
Line <- gvisLineChart(
  data = amnt_df,
  xvar = "issue_d",
  yvar = "Amount",
  options = list(
    legend = "none",
    title = "Loan Amount Issued by Month",
    hAxis = "{title:'Date Issued'}",
    vAxis = "{title:'Amount($)'}",
    tag = "chart"
  )
)
plot(Line)

Loan Volume by Month

# Line chart of the number of loans issued on each issue date.
Line2 <- gvisLineChart(
  data = amnt_df,
  xvar = "issue_d",
  yvar = "Volume",
  options = list(
    legend = "none",
    title = "Loan Volume Issued by Month",
    hAxis = "{title:'Date Issued'}",
    vAxis = "{title:'Volume'}"
  )
)
plot(Line2)

Average Loan by Year

# Mean loan amount for each issue year.
plot_avg_loan <- loan %>%
  select(Year, loan_amnt) %>%
  group_by(Year) %>%
  summarise(avg_loan = mean(loan_amnt))

# Line chart of the yearly mean loan size.
Line_avg <- gvisLineChart(
  data = plot_avg_loan,
  xvar = "Year",
  yvar = "avg_loan",
  options = list(
    legend = "none",
    title = "Average Loan Size by Year",
    hAxis = "{title:'Year Issued'}",
    vAxis = "{title:'Average Loan Size($)'}"
  )
)
plot(Line_avg)

Starting from 2012, Lending Club grew significantly.

Some key variables: Grade

Overall: Majority of the loans fell into A, B, C, D.

# Overall grade distribution: tabulate loans per grade and draw a pie chart.
plot_grade_pie <- as.data.frame(
  table(loan$grade)
)
grade_pie <- gvisPieChart(data = plot_grade_pie)
plot(grade_pie)

Grade vs. Interest Rate: From grade A to grade G, the interest rate increases.

# Box plot of interest rate, with one box per loan grade.
p_grade_interest <- plot_ly(
  data = loan,
  y = ~int_rate,
  color = ~grade,
  type = "box"
)
p_grade_interest

Loan Volume by Grade over the years: Did not change a lot

# Number of loans for every (Year, grade) combination.
plot_grade <- loan %>%
  select(Year, loan_amnt, grade) %>%
  group_by(Year, grade) %>%
  summarise(total = n())

# Wide layout: one row per Year and one column per grade.
reshaped <- cast(data = plot_grade, formula = Year ~ grade)

# Stacked-percent stepped area chart of the grade mix in each year.
SteppedArea <- gvisSteppedAreaChart(
  data = reshaped,
  xvar = "Year",
  yvar = LETTERS[1:7],  # grades "A" through "G"
  options = list(isStacked = "percent")
)
plot(SteppedArea)

Loan Status

# Tabulate loans per loan status and draw the counts as a pie chart.
plot_status <- as.data.frame(
  table(loan$loan_status)
)
Pie <- gvisPieChart(data = plot_status)
plot(Pie)

Loan Amount vs. Loan Status

# Box plot of loan amount, with one box per loan status.
p_status <- plot_ly(
  data = loan,
  y = ~loan_amnt,
  color = ~loan_status,
  type = "box"
)
p_status
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

Maps

Value by State: Most of the money is borrowed by people from California.

# Total loan amount per borrower state (missing amounts are ignored).
state_by_value <- loan %>%
  group_by(addr_state) %>%
  summarise(value = sum(loan_amnt, na.rm = TRUE))

# US map with each state coloured by its total loan amount.
GeoStates <- gvisGeoChart(
  data = state_by_value,
  locationvar = "addr_state",
  colorvar = "value",
  options = list(
    region = "US",
    displayMode = "regions",
    resolution = "provinces",
    width = 600,
    height = 400
  )
)
plot(GeoStates)

Default by State: IA has a default rate of 14% (2 out of 14). Default rates in the other states do not vary much.

# Default rate by state. Loans in the literal "Default" status are rare, so a
# broader definition made of several 'bad' statuses is used instead.
bad_indicators <- c(
  "Charged Off",
  "Default",
  "Does not meet the credit policy. Status:Charged Off",
  "In Grace Period",
  "Default Receiver",
  "Late (16-30 days)",
  "Late (31-120 days)"
)

# Statuses are compared after trimming surrounding whitespace so that padding
# in the data cannot prevent a status from matching the list above.
loan_status_clean <- trimws(as.character(loan$loan_status))

# NOTE: despite its name, is_bad is 0 for a 'bad' loan and 1 for any other
# loan; a blank status yields NA.
loan$is_bad <- ifelse(
  loan_status_clean %in% bad_indicators,
  0,
  ifelse(loan_status_clean == "", NA, 1)
)

# Per state: share of loans with a known status that are flagged bad.
# Rows with an NA flag are left out of both numerator and denominator so a
# single blank status does not turn the whole state's rate into NA.
default_rate <- loan %>%
  group_by(addr_state) %>%
  summarise(
    countn = sum(!is.na(is_bad)),
    sumn = sum(is_bad, na.rm = TRUE),
    default_rate = (1 - sumn / countn) * 100
  )

# US map with each state coloured by its default rate (in percent).
GeoStates_defult <- gvisGeoChart(
  default_rate,
  "addr_state",
  "default_rate",
  options = list(
    region = "US",
    displayMode = "regions",
    resolution = "provinces",
    width = 600,
    height = 400
  )
)
plot(GeoStates_defult)

Reason for Borrowing: Top reason is debt_consolidation.

# Tabulate loans per stated purpose and draw the counts as a pie chart.
plot_purpose <- as.data.frame(
  table(loan$purpose)
)
Pie_purpose <- gvisPieChart(data = plot_purpose)
plot(Pie_purpose)
# Word cloud built from the first 10,000 values of column 22
# (presumably the loan title -- TODO confirm against the CSV header).
# VectorSource() treats every element as one document. DataframeSource() in
# current tm releases requires `doc_id` and `text` columns and would reject a
# one-column data frame, so VectorSource() is used for compatibility.
loan_title_corpus <- Corpus(
  VectorSource(as.character(head(loan[, 22], n = 10000)))
)
loan_title_corpus <- tm_map(loan_title_corpus, removePunctuation)
loan_title_corpus <- tm_map(loan_title_corpus, content_transformer(tolower))

# Fixed seed so the word layout is reproducible.
set.seed(123)
wordcloud(
  loan_title_corpus,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.3,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Paired")
)

Who is borrowing: Financial Services, Educational Systems, etc.

# Word cloud built from the first 10,000 values of column 11
# (presumably the employer title -- TODO confirm against the CSV header).
# VectorSource() treats every element as one document. DataframeSource() in
# current tm releases requires `doc_id` and `text` columns and would reject a
# one-column data frame, so VectorSource() is used for compatibility.
loan_emp_corpus <- Corpus(
  VectorSource(as.character(head(loan[, 11], n = 10000)))
)
loan_emp_corpus <- tm_map(loan_emp_corpus, removePunctuation)
loan_emp_corpus <- tm_map(loan_emp_corpus, content_transformer(tolower))
# Drop generic company-name words so they do not dominate the cloud.
loan_emp_corpus <- tm_map(
  loan_emp_corpus,
  removeWords,
  c("inc", "group", "corporation", "llc", "company", "and", "corp", "institute")
)

# Fixed seed so the word layout is reproducible.
set.seed(124)
wordcloud(
  loan_emp_corpus,
  scale = c(2, 0.2),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.3,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Paired")
)

Employment Length

# Tabulate loans per employment length and draw the counts as a pie chart.
plot_emp_pie <- as.data.frame(
  table(loan$emp_length)
)
emp_pie <- gvisPieChart(data = plot_emp_pie)
plot(emp_pie)

Employment Length vs. Default Rate

# Number of loans for every (emp_length, is_bad) combination.
plot_emp_len <- loan %>%
  select(emp_length, is_bad, loan_amnt) %>%
  group_by(emp_length, is_bad) %>%
  summarise(countn = n())

# Wide layout: one row per is_bad value, one column per employment length.
reshaped_emp <- cast(plot_emp_len, is_bad ~ emp_length)

# Stacked-percent column chart of the employment-length mix per is_bad value.
# The x variable itself is excluded from the plotted series; passing every
# column name would add `is_bad` as a spurious extra series.
SteppedArea_emp <- gvisColumnChart(
  reshaped_emp,
  xvar = "is_bad",
  yvar = setdiff(names(reshaped_emp), "is_bad"),
  options = list(isStacked = "percent")
)
plot(SteppedArea_emp)

Home Ownership: Mortgage accounts for 50%

# Pie chart of loan counts per home-ownership category.
plot(
  gvisPieChart(
    data = as.data.frame(table(loan$home_ownership))
  )
)

Home Ownership vs. Default Rate

# Number of loans for every (home_ownership, is_bad) combination.
plot_home <- loan %>%
  select(home_ownership, is_bad, loan_amnt) %>%
  group_by(home_ownership, is_bad) %>%
  summarise(countn = n())

# Wide layout: one row per is_bad value, one column per ownership category.
reshaped_home <- cast(plot_home, is_bad ~ home_ownership)

# Keep the x variable plus the categories that are charted. Positions are
# looked up by name rather than hard-coded, so the selection stays correct
# if the set or order of categories in the data differs.
home_levels <- c("MORTGAGE", "NONE", "OTHER", "OWN", "RENT")
keep_cols <- match(c("is_bad", home_levels), names(reshaped_home))
reshaped_home <- reshaped_home[keep_cols]

# Stacked-percent column chart of the ownership mix per is_bad value.
# hAxes must be a complete JSON array, so the closing bracket is required.
SteppedArea_home <- gvisColumnChart(reshaped_home, xvar="is_bad", 
                                   yvar= home_levels,
                                   options=list(isStacked='percent',
                                                hAxes = "[{title:'Default Status'}]"))

plot(SteppedArea_home)
## Restore the option values saved in `op` (the settings in effect before
## gvis.plot.tag was changed at the top of the script)
options(op)

Reference: https://www.kaggle.com/erykwalczak/d/wendykan/lending-club-loan-data/initial-loan-book-analysis